# Setup
# Attach the text-analysis packages with library(): a missing package then
# stops the script right here, whereas require() only warns and returns FALSE,
# which defers the failure to the first quanteda/LSS call further down.
# NOTE: the global environment is intentionally not cleared; every object used
# below is created by this script. Run it in a fresh R session for a clean state.
library(quanteda)
library(LSS)

# Set quanteda's `threads` option to 8.
quanteda_options(threads = 8)

# Evaluate the project file functions.R in the current session.
source("functions.R")

# Tokens
# Read the saved tokens object and drop every token that matches the regular
# expression below, i.e. tokens made up solely of Unicode number (\p{N}) and
# punctuation (\p{P}) characters.
toks <- tokens_remove(
    readRDS("data_tokens_en.RDS"),
    '^[\\p{N}\\p{P}]+$',
    valuetype = 'regex'
)

# Document-feature matrix, built one step at a time:
#   1. construct the dfm, removing the empty-string feature
#      (presumably padding left by earlier token removal -- confirm);
#   2. keep only features whose total frequency is at least 10;
#   3. group rows with dfm_group()'s default grouping
#      (NOTE(review): presumably re-combines rows by document id -- confirm
#      against the installed quanteda version).
dfmt <- dfm(toks, remove = "")
dfmt <- dfm_trim(dfmt, min_termfreq = 10)
dfmt <- dfm_group(dfmt)

# Document-level variables of the grouped dfm; scores are added to this below.
dat <- docvars(dfmt)

# LSS
# Load the saved model object and score the documents in `dfmt` with it.
# The argument is spelled out as `newdata`: the previous `newdat` only reached
# the predict method through R's partial argument matching, which breaks
# silently if the method ever gains another argument starting with "newdat".
lss <- readRDS("lss_en.RDS")
dat$lss <- predict(lss, newdata = dfmt)

# LSD
# Count, per document, the tokens matched by the first two keys of
# data_dictionary_LSD2015 ("negative" and "positive"), then group the rows the
# same way as `dfmt` so both matrices line up row by row.
dfmt_dict <- dfm(tokens_lookup(toks, data_dictionary_LSD2015[1:2])) %>% 
        dfm_group()
# Net sentiment = (positive hits - negative hits) / number of features counted
# in `dfmt`. Columns are selected by key name rather than by position so the
# sign of the score does not depend on the order of features in the dfm.
dat$lsd <- rowSums(dfmt_dict[, "positive"] - dfmt_dict[, "negative"]) / rowSums(dfmt)

# GDP
# Read the GDP growth series, parse its `date` column as Date and derive the
# calendar year used later as the merge key.
dat_gdp <- read.csv("data/united-states-gdp-growth-rate.csv")
dat_gdp$date <- as.Date(dat_gdp$date)
dat_gdp$year <- lubridate::year(dat_gdp$date)

# Keep only observations dated 1981-01-01 through 2008-12-31 (inclusive).
# which() drops rows whose date is NA, matching what subset() would do.
in_period <- dat_gdp$date >= as.Date("1981-01-01") &
    dat_gdp$date <= as.Date("2008-12-31")
dat_gdp <- dat_gdp[which(in_period), , drop = FALSE]

# Yearly aggregates of the document scores
# Mean LSS and LSD score per year; NA scores are ignored (na.rm = TRUE).
# NOTE(review): assumes the document variables in `dat` include `year`.
dat_mean <- aggregate(list(lss = dat$lss, lsd = dat$lsd), 
                     by = list(year = dat$year), FUN = mean, na.rm = TRUE)
# Number of documents per year.
dat_n <- aggregate(list(n = dat$year), 
                   by = list(year = dat$year), FUN = length)

# Left-join the yearly means and counts onto the GDP series. The key is named
# explicitly: without `by`, merge() joins on every column name the two frames
# happen to share, so an unrelated same-named column would silently change
# the join.
dat_gdp <- merge(dat_gdp, dat_mean, by = "year", all.x = TRUE, all.y = FALSE)
dat_gdp <- merge(dat_gdp, dat_n, by = "year", all.x = TRUE, all.y = FALSE)

# Standardise both scores (mean 0, sd 1). scale() returns a one-column matrix
# carrying "scaled:center"/"scaled:scale" attributes; as.numeric() stores the
# same values as plain numeric columns instead of matrix columns.
dat_gdp$lss <- as.numeric(scale(dat_gdp$lss))
dat_gdp$lsd <- as.numeric(scale(dat_gdp$lsd))

# Write the merged yearly data set to disk.
saveRDS(dat_gdp, "data_gdp.RDS")
